
import requests
from lxml import html
import base64
'''
Scraper that fetches a single national-policy page from gov.cn by URL.
Example URL: http://www.gov.cn/zhengce/content/2021-03/09/content_5591819.htm
'''
def get_first_url(url):
    """Fetch a gov.cn policy page, print its main content HTML, and return the extracted data.

    Downloads the page at *url*, extracts the title, publication date and
    main-content element via XPath, prints the serialized content block,
    and returns everything so callers can use it programmatically
    (previously the title/date/key were computed and then discarded).

    :param url: full URL of a policy page, e.g.
        http://www.gov.cn/zhengce/content/2021-03/09/content_5591819.htm
    :return: dict with keys:
        - "key":     base64-encoded bytes of the URL (stable page id)
        - "title":   list of text nodes matched for the title (may be empty)
        - "date":    list of text nodes matched for the publication date
        - "content": serialized HTML of the content element, or None when
                     the expected element is not found on the page
    :raises requests.HTTPError: when the server responds with an error status.
    """
    params = {"sn": "a14062711010650606ss9p000000", "size": "0"}
    headers = {
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/84.0.4147.105 Safari/537.36 Edg/84.0.522.49"
    }
    # timeout: without it requests can block forever on an unresponsive host
    response = requests.get(url=url, params=params, headers=headers, timeout=30)
    # Fail loudly on 4xx/5xx instead of silently parsing an error page.
    response.raise_for_status()
    response.encoding = "utf-8"
    tree = html.etree.HTML(response.text)
    title = tree.xpath('//*[@class="article oneColumn pub_border"]/h1/text()')  # page title
    date = tree.xpath('//*[@class="pages-date"]/text()')  # publication date
    content = tree.xpath('/html/body/div[3]/div[2]')  # main content container
    key = base64.b64encode(url.encode("utf8"))
    # Guard the empty-match case: the original indexed content[0]
    # unconditionally, raising IndexError whenever the page layout
    # differed from the expected one.
    my_want = None
    if content:
        my_want = html.tostring(content[0], encoding='utf-8').decode('utf-8')
        print(my_want)
    return {"key": key, "title": title, "date": date, "content": my_want}
 